# Read the wildfire records, the two summary tables and the fire-suppression
# budget table from CSV files in the working directory.
WildfireData <- read.csv(file = 'final_wildfire.csv')
summary_nature <- read.csv(file = 'summary_nature.csv')
summary_peoplecaused <- read.csv(file = 'summary_peoplecaused.csv')
fire_budget <- read.csv(file = "fire_suppression.csv")

# Free-standing copies of three WildfireData columns.
Avg_Temp <- WildfireData[["tair_day_livneh_vic"]]
Avg_SoilMoisture <- WildfireData[["soilmoist1_day_livneh_vic"]]
Avg_Rainfall <- WildfireData[["rainfall_day_livneh_vic"]]
## Five Point Summary
# The Budget columns are cleaned of "$" and "," and converted to numeric so
# they can be summarised and plotted. A single base-R gsub() pass removes
# every "$" and "," at once, with no scratch variable and no stringr call.
fire_budget$Budget <- as.numeric(gsub("[$,]", "", fire_budget$Budget))
WildfireData$Budget <- as.numeric(gsub("[$,]", "", WildfireData$Budget))
# Render the per-column summary table for the whole data set.
xkablesummary(WildfireData)
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length
| X | Year | DISCOVERY_DOY | Budget | DISCOVERY_DATE | STAT_CAUSE_CODE | STAT_CAUSE_DESCR | CONT_DATE | CONT_DOY | FIRE_SIZE | FIRE_SIZE_CLASS | STATE | existDay | tair_day_livneh_vic | month | soilmoist1_day_livneh_vic | rainfall_day_livneh_vic | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min | Min. : 1 | Min. :1992 | Min. : 1.0 | Min. : 43800000 | Min. :2448622 | Min. : 1.000 | Length:189550 | Min. :2448622 | Min. : 1.0 | Min. : 0.00 | Length:189550 | Length:189550 | Min. : 0.00 | Min. :-3.95 | Min. : 1.000 | Min. :10.51 | Min. : 0.000 |
| Q1 | 1st Qu.: 47388 | 1st Qu.:1997 | 1st Qu.:164.0 | 1st Qu.: 85591000 | 1st Qu.:2450624 | 1st Qu.: 2.000 | Class :character | 1st Qu.:2451362 | 1st Qu.:169.0 | 1st Qu.: 0.10 | Class :character | Class :character | 1st Qu.: 0.00 | 1st Qu.:16.20 | 1st Qu.: 6.000 | 1st Qu.:11.27 | 1st Qu.: 0.005 |
| Median | Median : 94776 | Median :2003 | Median :202.0 | Median :166000000 | Median :2452860 | Median : 5.000 | Mode :character | Median :2453248 | Median :206.0 | Median : 0.25 | Mode :character | Mode :character | Median : 0.00 | Median :20.51 | Median : 7.000 | Median :12.09 | Median : 0.072 |
| Mean | Mean : 94776 | Mean :2003 | Mean :201.6 | Mean :205045692 | Mean :2452884 | Mean : 5.691 | NA | Mean :2453347 | Mean :203.6 | Mean : 67.24 | NA | NA | Mean : 1.05 | Mean :18.87 | Mean : 7.181 | Mean :12.97 | Mean : 0.404 |
| Q3 | 3rd Qu.:142163 | 3rd Qu.:2009 | 3rd Qu.:245.0 | 3rd Qu.:252000000 | 3rd Qu.:2455034 | 3rd Qu.: 9.000 | NA | 3rd Qu.:2455763 | 3rd Qu.:246.0 | 3rd Qu.: 1.00 | NA | NA | 3rd Qu.: 0.00 | 3rd Qu.:22.71 | 3rd Qu.: 9.000 | 3rd Qu.:13.80 | 3rd Qu.: 0.395 |
| Max | Max. :189550 | Max. :2015 | Max. :366.0 | Max. :608000000 | Max. :2457388 | Max. :13.000 | NA | Max. :2457388 | Max. :366.0 | Max. :315578.80 | NA | NA | Max. :3653.00 | Max. :28.71 | Max. :12.000 | Max. :28.52 | Max. :31.959 |
| NA | NA | NA | NA | NA | NA | NA | NA | NA’s :97642 | NA’s :97642 | NA | NA | NA | NA’s :97642 | NA’s :13859 | NA’s :13859 | NA’s :13859 | NA’s :13859 |
library(ggplot2)
library(gridExtra)
# Histograms of the three climate variables plus a per-year fire count.
# Each plot maps a WildfireData column by name, so the aesthetic always comes
# from the same data frame that is passed to ggplot().
# Only binwidth is given: when both are supplied, binwidth overrides bins.

# Average Temperature
TempHist <- ggplot(WildfireData, aes(x = tair_day_livneh_vic)) +
  geom_histogram(binwidth = 0.5, col = "black", fill = "light blue 2") +
  labs(x = "Avg. Temp (C)", y = "Frequency",
       title = "HISTOGRAM: Average Temperature")
# Average Soil Moisture
SoilHist <- ggplot(WildfireData, aes(x = soilmoist1_day_livneh_vic)) +
  geom_histogram(binwidth = 0.5, col = "black", fill = "orange red 2") +
  labs(x = "Avg. Soil Moisture", y = "Frequency",
       title = "HISTOGRAM: Average Soil Moisture")
# Average Rainfall
RainHist <- ggplot(WildfireData, aes(x = rainfall_day_livneh_vic)) +
  geom_histogram(binwidth = 0.5, col = "black", fill = "green 3") +
  labs(x = "Avg. Rainfall", y = "Frequency",
       title = "HISTOGRAM: Average Rainfall")
# Wildfire Count by Year
# geom_bar() counts the rows for each Year. The previous
# geom_histogram(stat = "count") ignored its binwidth/bins arguments and only
# added an "Ignoring unknown parameters" warning.
CountHist <- ggplot(WildfireData, aes(x = Year)) +
  geom_bar(col = "black", fill = "yellow") +
  labs(x = "Years", y = "Frequency of Wildfires",
       title = "Wildfires count by year")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
# Draw the four histograms in a 2 x 2 grid and keep the arranged object.
Histograms <- grid.arrange(
  grobs = list(TempHist, SoilHist, RainHist, CountHist),
  nrow = 2, ncol = 2
)
## Warning: Removed 13859 rows containing non-finite values (stat_bin).
## Warning: Removed 13859 rows containing non-finite values (stat_bin).
## Warning: Removed 13859 rows containing non-finite values (stat_bin).
# Write the arranged grid to Histograms.jpg. No width/height is passed, so
# ggsave picks the size itself and reports it ("Saving 7 x 5 in image").
ggsave(filename = "Histograms.jpg", plot = Histograms)
## Saving 7 x 5 in image
## Bar Graphs
# Fire Size: number of records in each FIRE_SIZE_CLASS.
FireBar <- ggplot(WildfireData, aes(FIRE_SIZE_CLASS)) +
  geom_bar(colour = "black", fill = "orange") +
  xlab("Fire Size Class") +
  ylab("Frequency") +
  ggtitle("Frequency of Wildfires by Size Classes")
# Years: number of records in each Year.
YearsBar <- ggplot(WildfireData, aes(Year)) +
  geom_bar(colour = "black", fill = "yellow") +
  xlab("Years") +
  ylab("Frequency") +
  ggtitle("Frequency of Wildfires by Year")
# Budget: number of records at each Budget value.
# This plot is built here but is not part of the grid drawn below.
BudgetBar <- ggplot(WildfireData, aes(Budget)) +
  geom_bar(colour = "black", fill = "Pink 2") +
  xlab("Budget") +
  ylab("Frequency") +
  ggtitle("Frequency of Wildfires by Budget")
# Stack the size-class and year charts in two rows.
grid.arrange(FireBar, YearsBar, nrow = 2)
## Pie Charts
# Each chart is written to its own JPEG file. The device is closed with
# dev.off() right after drawing: without it the file is never finalised and
# every later plot is redirected into the still-open JPEG device.
# Colours are sized from the tabulated data, so there is exactly one colour
# per slice regardless of how many categories the column contains.

# Fire size class
size_counts <- table(WildfireData$FIRE_SIZE_CLASS)
jpeg("sizeclasspie.jpeg")
# pie() is called for its drawing side effect; the assignment is kept so the
# name still exists for later code.
sizeclasspie <- pie(size_counts, col = rainbow(length(size_counts)),
                    main = "Pie Chart of Fire Size Class")
invisible(dev.off())

# Wildfire cause
cause_counts <- table(WildfireData$STAT_CAUSE_DESCR)
jpeg("cause_descrpPie.jpeg")
cause_descrpPie <- pie(cause_counts, col = rainbow(length(cause_counts)),
                       main = "Pie Chart of Wildfire Cause")
invisible(dev.off())

# Retained unchanged so `lbls` still exists for any later code. It is not used
# for the slice labels: pie() labels slices with the names of the table.
lbls <- c("Lightning", "Eqipment Use", "Smoking", "Campfire", "Debris Burning", "Railroad", "Arson", "Children", "Misc." )
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Number of records per Year, stored in a column called Count, drawn as a
# single line.
yearly_count <- count(WildfireData, Year, name = "Count")
ggplot(data = yearly_count, mapping = aes(Year, Count, group = 1)) +
  geom_line() +
  labs(title = "Yearly Recorded Fires")
# Convert Budget to numeric only while it is still text. Re-running the
# string clean-up on an already-numeric column would push the values through
# character formatting and back for no benefit.
if (!is.numeric(fire_budget$Budget)) {
  # Strip every "$" and "," in one base-R pass before converting.
  fire_budget$Budget <- as.numeric(gsub("[$,]", "", fire_budget$Budget))
}
# Budget over time as a single line.
ggplot(fire_budget, aes(x = Year, y = Budget, group = 1)) +
  geom_line() +
  ggtitle("California Fire Suppression Budget 1979-2021")
# Mean FIRE_SIZE for each Year.
dat <- aggregate(FIRE_SIZE ~ Year, data = WildfireData, FUN = mean)
# Take the year range in the title from the aggregated data so the label
# always matches what is plotted. The previous hard-coded "1992-2013" did not
# match a Year column whose summary runs from 1992 to 2015.
year_span <- range(dat$Year)
ggplot(dat, aes(x = Year, y = FIRE_SIZE, group = 1)) +
  geom_line() +
  ggtitle(paste0("Wildfire Sizes (", year_span[1], "-", year_span[2], ")")) +
  ylab("Fire Size")
Hu Zhongyang's part: initialize the data
# Read fresh copies of the wildfire records and the two summary tables.
final_fire <- read.csv('final_wildfire.csv')
summary_nature <- read.csv('summary_nature.csv')
summary_peoplecaused <- read.csv('summary_peoplecaused.csv')

# Columns 4 to 7 of both summary tables get the same descriptive names.
measure_names <- c('temperature', 'soilmoisture', 'rainfall', 'nfire')
colnames(summary_nature)[4:7] <- measure_names
colnames(summary_peoplecaused)[4:7] <- measure_names

# Year and month become factors so they act as grouping variables in the
# box plots and ANOVA tests that follow.
summary_peoplecaused$Year <- as.factor(summary_peoplecaused$Year)
summary_peoplecaused$month <- as.factor(summary_peoplecaused$month)
summary_nature$Year <- as.factor(summary_nature$Year)
summary_nature$month <- as.factor(summary_nature$month)
Plot the yearly trend
library(ggplot2)
# Total nfire per Year for each summary table.
temp_plot <- aggregate(nfire ~ Year, data = summary_nature, FUN = sum)
temp_plot2 <- aggregate(nfire ~ Year, data = summary_peoplecaused, FUN = sum)
# Overlay both yearly series as points; the colour tells the tables apart.
ggplot() +
  geom_point(aes(x = Year, y = nfire), data = temp_plot, colour = 'blue') +
  geom_point(aes(x = Year, y = nfire), data = temp_plot2, colour = 'red') +
  labs(
    title = 'Number of Fires Each Year (Red for people-caused, Blue for other reasons)',
    y = 'Number of Fires'
  )
Box plots by year and by month to show the trends
library(ggpubr)
## Registered S3 methods overwritten by 'car':
## method from
## influence.merMod lme4
## cooks.distance.influence.merMod lme4
## dfbeta.influence.merMod lme4
## dfbetas.influence.merMod lme4
# Box plots of fire counts and climate measures, grouped by Year and by month.

# Fire counts in the people-caused table.
ggplot(summary_peoplecaused, aes(Year, nfire)) +
  geom_boxplot() +
  labs(title = 'box-plot of number of people-caused fires for different years', y = 'Number of Fires')
ggplot(summary_peoplecaused, aes(month, nfire)) +
  geom_boxplot() +
  labs(title = 'box-plot of number of people-caused fires for different months', y = 'Number of Fires')

# Fire counts in the other-reasons table.
ggplot(summary_nature, aes(Year, nfire)) +
  geom_boxplot() +
  labs(title = 'box-plot of number of fires caused by other reasons for different years', y = 'Number of Fires')
ggplot(summary_nature, aes(month, nfire)) +
  geom_boxplot() +
  labs(title = 'box-plot of number of fires caused by other reasons for different months', y = 'Number of Fires')

# Temperature (taken from the people-caused table).
ggplot(summary_peoplecaused, aes(Year, temperature)) +
  geom_boxplot() +
  labs(title = 'box-plot of temperature for different years', y = 'temperature')
ggplot(summary_peoplecaused, aes(month, temperature)) +
  geom_boxplot() +
  labs(title = 'box-plot of temperature for different months', y = 'temperature')

# Soil moisture (taken from the other-reasons table).
ggplot(summary_nature, aes(Year, soilmoisture)) +
  geom_boxplot() +
  labs(title = 'box-plot of soil moisture for different years', y = 'soil moisture')
ggplot(summary_nature, aes(month, soilmoisture)) +
  geom_boxplot() +
  labs(title = 'box-plot of soil moisture for different months', y = 'soil moisture')

# Rainfall (taken from the other-reasons table).
ggplot(summary_nature, aes(Year, rainfall)) +
  geom_boxplot() +
  labs(title = 'box-plot of rainfall for different years', y = 'average daily rainfall')
ggplot(summary_nature, aes(month, rainfall)) +
  geom_boxplot() +
  labs(title = 'box-plot of rainfall for different months', y = 'average daily rainfall')
Setting up different groups for the first hypothesis test. We want to compare the different size classes of wildfires and see how conditions differed between them.
# One data frame per fire size class (A through G), selected by comparing the
# FIRE_SIZE_CLASS column with each class letter.
size_class <- WildfireData$FIRE_SIZE_CLASS
classA <- WildfireData[size_class == 'A', ]
classB <- WildfireData[size_class == 'B', ]
classC <- WildfireData[size_class == 'C', ]
classD <- WildfireData[size_class == 'D', ]
classE <- WildfireData[size_class == 'E', ]
classF <- WildfireData[size_class == 'F', ]
classG <- WildfireData[size_class == 'G', ]
When comparing the conditions during the smallest wildfires (class A) to those during the largest wildfires (class G), it appears that air temperature was lower, soil moisture was higher, and rainfall was higher during the smallest wildfires.
# Two-sample t-test with pooled variance (var.equal = TRUE): air temperature
# for class A fires vs class G fires.
# NOTE(review): the pooled test assumes equal variances in both groups;
# confirm, or drop var.equal to get the Welch test.
t.test(classA$tair_day_livneh_vic, classG$tair_day_livneh_vic, var.equal = TRUE)
##
## Two Sample t-test
##
## data: classA$tair_day_livneh_vic and classG$tair_day_livneh_vic
## t = -8.7583, df = 89797, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.195132 -2.026586
## sample estimates:
## mean of x mean of y
## 18.70072 21.31158
# Two-sample t-test with pooled variance (var.equal = TRUE): soil moisture
# for class A fires vs class G fires.
t.test(classA$soilmoist1_day_livneh_vic, classG$soilmoist1_day_livneh_vic, var.equal = TRUE)
##
## Two Sample t-test
##
## data: classA$soilmoist1_day_livneh_vic and classG$soilmoist1_day_livneh_vic
## t = 8.175, df = 89797, p-value = 2.998e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.8259187 1.3468515
## sample estimates:
## mean of x mean of y
## 13.08929 12.00291
# Two-sample t-test with pooled variance (var.equal = TRUE): rainfall
# for class A fires vs class G fires.
t.test(classA$rainfall_day_livneh_vic, classG$rainfall_day_livneh_vic, var.equal = TRUE)
##
## Two Sample t-test
##
## data: classA$rainfall_day_livneh_vic and classG$rainfall_day_livneh_vic
## t = 4.4724, df = 89797, p-value = 7.744e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.1484135 0.3799739
## sample estimates:
## mean of x mean of y
## 0.4636929 0.1994992
# One data frame per cause of interest, selected by STAT_CAUSE_CODE
# (codes 1, 2 and 7).
cause_code <- WildfireData$STAT_CAUSE_CODE
code1 <- WildfireData[cause_code == 1, ]
code2 <- WildfireData[cause_code == 2, ]
code7 <- WildfireData[cause_code == 7, ]
Now we will compare different groups of wildfires, categorized by their causes.
Code 1: Lightning; Code 2: Equipment Use; Code 7: Arson
When looking at the wildfires caused by Lightning versus those caused by Equipment Use, average temperature, soil moisture, and rainfall in CA were significantly different. In particular, during lightning-caused wildfires, air temperature was higher, soil moisture was lower, and rainfall was higher.
# Two-sample t-test with pooled variance (var.equal=TRUE): air temperature
# for cause code 1 fires vs cause code 2 fires.
t.test(code1$tair_day_livneh_vic, code2$tair_day_livneh_vic, var.equal=TRUE)
##
## Two Sample t-test
##
## data: code1$tair_day_livneh_vic and code2$tair_day_livneh_vic
## t = 85.583, df = 62967, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2.936384 3.074032
## sample estimates:
## mean of x mean of y
## 22.10035 19.09514
# Two-sample t-test with pooled variance (var.equal=TRUE): soil moisture
# for cause code 1 fires vs cause code 2 fires.
t.test(code1$soilmoist1_day_livneh_vic, code2$soilmoist1_day_livneh_vic, var.equal=TRUE)
##
## Two Sample t-test
##
## data: code1$soilmoist1_day_livneh_vic and code2$soilmoist1_day_livneh_vic
## t = -38.009, df = 62967, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.6264540 -0.5650133
## sample estimates:
## mean of x mean of y
## 12.19398 12.78972
# Two-sample t-test with pooled variance (var.equal=TRUE): rainfall
# for cause code 1 fires vs cause code 2 fires.
t.test(code1$rainfall_day_livneh_vic, code2$rainfall_day_livneh_vic, var.equal=TRUE)
##
## Two Sample t-test
##
## data: code1$rainfall_day_livneh_vic and code2$rainfall_day_livneh_vic
## t = 30.919, df = 62967, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.2152689 0.2444081
## sample estimates:
## mean of x mean of y
## 0.5694914 0.3396529
When comparing the wildfires caused by Lightning versus those caused by Arson, it appears that the air temperature, soil moisture, and average rainfall in CA were significantly different. In particular, during lightning-caused wildfires, air temperature was higher, soil moisture was lower, and average rainfall was higher.
# Two-sample t-test with pooled variance (var.equal=TRUE): air temperature
# for cause code 1 fires vs cause code 7 fires.
t.test(code1$tair_day_livneh_vic, code7$tair_day_livneh_vic, var.equal=TRUE)
##
## Two Sample t-test
##
## data: code1$tair_day_livneh_vic and code7$tair_day_livneh_vic
## t = 82.448, df = 43091, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 3.208857 3.365139
## sample estimates:
## mean of x mean of y
## 22.10035 18.81335
# Two-sample t-test with pooled variance (var.equal=TRUE): soil moisture
# for cause code 1 fires vs cause code 7 fires.
t.test(code1$soilmoist1_day_livneh_vic, code7$soilmoist1_day_livneh_vic, var.equal=TRUE)
##
## Two Sample t-test
##
## data: code1$soilmoist1_day_livneh_vic and code7$soilmoist1_day_livneh_vic
## t = -39.235, df = 43091, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.7279409 -0.6586716
## sample estimates:
## mean of x mean of y
## 12.19398 12.88729
# Two-sample t-test with pooled variance (var.equal=TRUE): rainfall
# for cause code 1 fires vs cause code 7 fires.
t.test(code1$rainfall_day_livneh_vic, code7$rainfall_day_livneh_vic, var.equal=TRUE)
##
## Two Sample t-test
##
## data: code1$rainfall_day_livneh_vic and code7$rainfall_day_livneh_vic
## t = 26.917, df = 43091, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.2019315 0.2336487
## sample estimates:
## mean of x mean of y
## 0.5694914 0.3517013
When looking at conditions during arson-caused wildfires versus equipment use-caused wildfires, it appears that air temperature and soil moisture were significantly different. In particular, during arson-caused fires, air temperature was lower and soil moisture was higher. Rainfall did not differ significantly between the two groups (p = 0.16).
# Two-sample t-test with pooled variance (var.equal=TRUE): air temperature
# for cause code 7 fires vs cause code 2 fires.
t.test(code7$tair_day_livneh_vic, code2$tair_day_livneh_vic, var.equal=TRUE)
##
## Two Sample t-test
##
## data: code7$tair_day_livneh_vic and code2$tair_day_livneh_vic
## t = -6.3054, df = 56768, p-value = 2.895e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.3693827 -0.1941972
## sample estimates:
## mean of x mean of y
## 18.81335 19.09514
# Two-sample t-test with pooled variance (var.equal=TRUE): soil moisture
# for cause code 7 fires vs cause code 2 fires.
t.test(code7$soilmoist1_day_livneh_vic, code2$soilmoist1_day_livneh_vic, var.equal=TRUE)
##
## Two Sample t-test
##
## data: code7$soilmoist1_day_livneh_vic and code2$soilmoist1_day_livneh_vic
## t = 4.7886, df = 56768, p-value = 1.684e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.05763517 0.13750998
## sample estimates:
## mean of x mean of y
## 12.88729 12.78972
# Two-sample t-test with pooled variance (var.equal=TRUE): rainfall
# for cause code 7 fires vs cause code 2 fires.
t.test(code7$rainfall_day_livneh_vic, code2$rainfall_day_livneh_vic, var.equal=TRUE)
##
## Two Sample t-test
##
## data: code7$rainfall_day_livneh_vic and code2$rainfall_day_livneh_vic
## t = 1.3998, df = 56768, p-value = 0.1616
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.004821767 0.028918604
## sample estimates:
## mean of x mean of y
## 0.3517013 0.3396529
Run ANOVA tests on the yearly and monthly statistics
# One-way ANOVA: people-caused fire counts across years.
summary(aov(formula = nfire ~ Year, data = summary_peoplecaused))
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 21 360393 17162 0.849 0.657
## Residuals 242 4892242 20216
# One-way ANOVA: people-caused fire counts across months.
summary(aov(formula = nfire ~ month, data = summary_peoplecaused))
## Df Sum Sq Mean Sq F value Pr(>F)
## month 11 4167871 378897 88.02 <2e-16 ***
## Residuals 252 1084764 4305
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# One-way ANOVA: fire counts in the other-reasons table across years.
summary(aov(formula = nfire ~ Year, data = summary_nature))
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 21 1352086 64385 0.608 0.911
## Residuals 242 25626302 105894
# One-way ANOVA: fire counts in the other-reasons table across months.
summary(aov(formula = nfire ~ month, data = summary_nature))
## Df Sum Sq Mean Sq F value Pr(>F)
## month 11 21164133 1924012 83.39 <2e-16 ***
## Residuals 252 5814256 23072
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# One-way ANOVA: temperature across years (people-caused table).
summary(aov(formula = temperature ~ Year, data = summary_peoplecaused))
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 21 31 1.47 0.033 1
## Residuals 242 10855 44.85
# One-way ANOVA: temperature across months (people-caused table).
summary(aov(formula = temperature ~ month, data = summary_peoplecaused))
## Df Sum Sq Mean Sq F value Pr(>F)
## month 11 10458 950.7 560.1 <2e-16 ***
## Residuals 252 428 1.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# One-way ANOVA: soil moisture across years (other-reasons table).
summary(aov(formula = soilmoisture ~ Year, data = summary_nature))
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 21 129.9 6.185 0.491 0.972
## Residuals 242 3048.5 12.597
# One-way ANOVA: soil moisture across months (other-reasons table).
summary(aov(formula = soilmoisture ~ month, data = summary_nature))
## Df Sum Sq Mean Sq F value Pr(>F)
## month 11 2616.9 237.90 106.8 <2e-16 ***
## Residuals 252 561.5 2.23
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# One-way ANOVA: rainfall across years (other-reasons table).
summary(aov(formula = rainfall ~ Year, data = summary_nature))
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 21 34.08 1.623 1.458 0.0931 .
## Residuals 242 269.34 1.113
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# One-way ANOVA: rainfall across months (other-reasons table).
summary(aov(formula = rainfall ~ month, data = summary_nature))
## Df Sum Sq Mean Sq F value Pr(>F)
## month 11 95.17 8.652 10.47 8.85e-16 ***
## Residuals 252 208.25 0.826
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Check the correlations among the numeric variables
# Budget in both summary tables is cleaned of "$" and "," and converted to
# numeric so cor() can use it. One base-R gsub() pass strips every "$" and
# "," at once, with no scratch variable and no stringr call.
summary_nature$Budget <- as.numeric(gsub("[$,]", "", summary_nature$Budget))
summary_peoplecaused$Budget <- as.numeric(gsub("[$,]", "", summary_peoplecaused$Budget))
# Correlation matrix of columns 4 to 9 of the other-reasons table.
cor_nature <- cor(summary_nature[4:9])
library(corrplot)
## corrplot 0.84 loaded
# Correlation plot for the other-reasons table.
corrplot(cor_nature, method = 'number')
# Same columns (4 to 9) for the people-caused table.
cor_people <- cor(summary_peoplecaused[4:9])
corrplot(cor_people, method = 'number')
# total = fire counts of both tables, added row by row.
# The count column is named "nfire" after the earlier renaming, so `$n` only
# resolved through `$` partial matching; the column is now named exactly.
# NOTE(review): assumes both tables hold the same Year/month rows in the same
# order; confirm before relying on the sum.
summary_nature$total <- summary_nature[["nfire"]] + summary_peoplecaused[["nfire"]]
# Columns 4, 5, 6, 8, 9 plus the new total (column 10); column 7 is left out.
cor_total <- cor(summary_nature[c(4, 5, 6, 8, 9, 10)])
corrplot(cor_total, method = 'number', type = 'lower', diag = TRUE)
Create linear models for the total number of fires (stored in the summary_nature table) and check their summaries; use VIF to decide which variables to use.
Use residual plots and Q-Q plots to check the normality of the residuals.
# Baseline model: total explained by temperature alone.
model1 <- lm(formula = total ~ temperature, data = summary_nature)
summary(model1)
##
## Call:
## lm(formula = total ~ temperature, data = summary_nature)
##
## Residuals:
## Min 1Q Median 3Q Max
## -487.32 -156.40 -17.98 115.94 786.54
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -364.702 31.780 -11.48 <2e-16 ***
## temperature 60.159 2.077 28.97 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 216.7 on 262 degrees of freedom
## Multiple R-squared: 0.7621, Adjusted R-squared: 0.7612
## F-statistic: 839.3 on 1 and 262 DF, p-value: < 2.2e-16
# Two-predictor model: temperature and soil moisture.
model2 <- lm(formula = total ~ temperature + soilmoisture, data = summary_nature)
# Diagnostic plots first, then the coefficient table.
plot(model2)
summary(model2)
##
## Call:
## lm(formula = total ~ temperature + soilmoisture, data = summary_nature)
##
## Residuals:
## Min 1Q Median 3Q Max
## -451.32 -143.28 -25.17 113.08 777.87
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 210.385 160.014 1.315 0.189733
## temperature 48.050 3.878 12.389 < 2e-16 ***
## soilmoisture -26.296 7.178 -3.664 0.000301 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 211.7 on 261 degrees of freedom
## Multiple R-squared: 0.7737, Adjusted R-squared: 0.772
## F-statistic: 446.2 on 2 and 261 DF, p-value: < 2.2e-16
# Three-predictor model: temperature, soil moisture and rainfall.
model3 <- lm(
  formula = total ~ temperature + soilmoisture + rainfall,
  data = summary_nature
)
summary(model3)
##
## Call:
## lm(formula = total ~ temperature + soilmoisture + rainfall, data = summary_nature)
##
## Residuals:
## Min 1Q Median 3Q Max
## -436.00 -146.77 -19.03 108.12 770.88
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 295.033 171.473 1.721 0.086517 .
## temperature 47.261 3.915 12.070 < 2e-16 ***
## soilmoisture -32.356 8.441 -3.833 0.000159 ***
## rainfall 22.824 16.798 1.359 0.175424
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 211.4 on 260 degrees of freedom
## Multiple R-squared: 0.7753, Adjusted R-squared: 0.7727
## F-statistic: 299.1 on 3 and 260 DF, p-value: < 2.2e-16
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Variance inflation factors for the three-predictor model.
car::vif(model3)
## temperature soilmoisture rainfall
## 3.735761 5.069232 1.916655
# Variance inflation factors for the two-predictor model.
car::vif(model2)
## temperature soilmoisture
## 3.653691 3.653691
# Single-predictor model: total explained by soil moisture alone.
model4 <- lm(formula = total ~ soilmoisture, data = summary_nature)
summary(model4)
##
## Call:
## lm(formula = total ~ soilmoisture, data = summary_nature)
##
## Residuals:
## Min 1Q Median 3Q Max
## -592.40 -176.14 -37.79 118.51 957.21
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2050.416 74.892 27.38 <2e-16 ***
## soilmoisture -102.079 4.723 -21.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 266.3 on 262 degrees of freedom
## Multiple R-squared: 0.6407, Adjusted R-squared: 0.6393
## F-statistic: 467.1 on 1 and 262 DF, p-value: < 2.2e-16
# Same predictors as model2, with the response on the log scale.
model5 <- lm(formula = log(total) ~ temperature + soilmoisture, data = summary_nature)
# Residual plot, the standard lm diagnostic plots, then the coefficient table.
residualPlot(model5)
plot(model5)
summary(model5)
##
## Call:
## lm(formula = log(total) ~ temperature + soilmoisture, data = summary_nature)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.92313 -0.27901 -0.02392 0.23857 1.31414
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.44582 0.30364 27.815 < 2e-16 ***
## temperature 0.05834 0.00736 7.926 6.59e-14 ***
## soilmoisture -0.23920 0.01362 -17.562 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4017 on 261 degrees of freedom
## Multiple R-squared: 0.8949, Adjusted R-squared: 0.8941
## F-statistic: 1112 on 2 and 261 DF, p-value: < 2.2e-16
# Variance inflation factors for the log-response model.
car::vif(model5)
## temperature soilmoisture
## 3.653691 3.653691